資料處理2

林嶔 (Lin, Chin)

Lesson 3

第一節:初級資料轉換(1)

– 這份資料是描述每個人疾病狀況的檔案,我們希望將這份直式資料轉為橫式資料

dat = read.csv("data3_1.csv", header = TRUE, fileEncoding = 'CP950')
head(dat, 10)
##    ID Disease
## 1   n     CKD
## 2   f      DM
## 3   m     HTN
## 4   w      DM
## 5   u      DM
## 6   j     HTN
## 7   d     CKD
## 8   l     CKD
## 9   y     HTN
## 10  w     HTN
##   ID   CKD Depression    DM   HTN
## 1  a  TRUE       TRUE  TRUE  TRUE
## 2  b  TRUE       TRUE  TRUE FALSE
## 3  c FALSE       TRUE  TRUE FALSE
## 4  d  TRUE      FALSE FALSE  TRUE
## 5  e  TRUE       TRUE FALSE FALSE
## 6  f  TRUE      FALSE  TRUE  TRUE

第一節:初級資料轉換(2)

– 請各位先回想在前面兩節課學過哪些函數、功能?我保證只使用已經學會的功能就足以應付這個問題了。

  1. 函數「length()」

  2. 函數「levels()」

  3. 各式索引函數

  4. 迴圈函數「for」

第一節:初級資料轉換(3)

– 而函數「levels()」僅能用在因子向量上,所以我們要先確認ID及Disease的屬性

class(dat[,1])
## [1] "factor"
class(dat[,2])
## [1] "factor"

– 很幸運的,在檔案讀進來的時候兩者就都已經是因子向量了,如果不是的話,記得要使用函數「as.factor()」進行轉換

dat[,1] = as.factor(dat[,1])
dat[,2] = as.factor(dat[,2])
levels.sample = levels(dat[,1])
levels.sample
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "p" "r" "s"
## [18] "t" "u" "v" "w" "x" "y" "z"
n.sample = length(levels.sample)
n.sample
## [1] 24
levels.disease = levels(dat[,2])
levels.disease
## [1] "CKD"        "Depression" "DM"         "HTN"
n.disease = length(levels.disease)
n.disease
## [1] 4

第一節:初級資料轉換(4)

– 這裡我們會用到函數「matrix」做一個空的矩陣,在最開始的時候我們可以先在矩陣內都填上0(填什麼並不重要,因為等等都會覆蓋掉)

new.dat = matrix(0, nrow = n.sample, ncol = n.disease+1)
colnames(new.dat) = c("ID", levels.disease)
new.dat[,1] = levels.sample
new.dat
##       ID  CKD Depression DM  HTN
##  [1,] "a" "0" "0"        "0" "0"
##  [2,] "b" "0" "0"        "0" "0"
##  [3,] "c" "0" "0"        "0" "0"
##  [4,] "d" "0" "0"        "0" "0"
##  [5,] "e" "0" "0"        "0" "0"
##  [6,] "f" "0" "0"        "0" "0"
##  [7,] "g" "0" "0"        "0" "0"
##  [8,] "h" "0" "0"        "0" "0"
##  [9,] "i" "0" "0"        "0" "0"
## [10,] "j" "0" "0"        "0" "0"
## [11,] "k" "0" "0"        "0" "0"
## [12,] "l" "0" "0"        "0" "0"
## [13,] "m" "0" "0"        "0" "0"
## [14,] "n" "0" "0"        "0" "0"
## [15,] "p" "0" "0"        "0" "0"
## [16,] "r" "0" "0"        "0" "0"
## [17,] "s" "0" "0"        "0" "0"
## [18,] "t" "0" "0"        "0" "0"
## [19,] "u" "0" "0"        "0" "0"
## [20,] "v" "0" "0"        "0" "0"
## [21,] "w" "0" "0"        "0" "0"
## [22,] "x" "0" "0"        "0" "0"
## [23,] "y" "0" "0"        "0" "0"
## [24,] "z" "0" "0"        "0" "0"

第一節:初級資料轉換(5)

i = 1
dat[dat[,1]==levels.sample[i],]
##    ID    Disease
## 11  a         DM
## 18  a        CKD
## 45  a Depression
## 49  a        HTN
dat[dat[,1]==levels.sample[i],2]
## [1] DM         CKD        Depression HTN       
## Levels: CKD Depression DM HTN
levels.disease %in% dat[dat[,1]==levels.sample[i],2]
## [1] TRUE TRUE TRUE TRUE
new.dat[i,-1] = levels.disease %in% dat[dat[,1]==levels.sample[i],2]
new.dat
##       ID  CKD    Depression DM     HTN   
##  [1,] "a" "TRUE" "TRUE"     "TRUE" "TRUE"
##  [2,] "b" "0"    "0"        "0"    "0"   
##  [3,] "c" "0"    "0"        "0"    "0"   
##  [4,] "d" "0"    "0"        "0"    "0"   
##  [5,] "e" "0"    "0"        "0"    "0"   
##  [6,] "f" "0"    "0"        "0"    "0"   
##  [7,] "g" "0"    "0"        "0"    "0"   
##  [8,] "h" "0"    "0"        "0"    "0"   
##  [9,] "i" "0"    "0"        "0"    "0"   
## [10,] "j" "0"    "0"        "0"    "0"   
## [11,] "k" "0"    "0"        "0"    "0"   
## [12,] "l" "0"    "0"        "0"    "0"   
## [13,] "m" "0"    "0"        "0"    "0"   
## [14,] "n" "0"    "0"        "0"    "0"   
## [15,] "p" "0"    "0"        "0"    "0"   
## [16,] "r" "0"    "0"        "0"    "0"   
## [17,] "s" "0"    "0"        "0"    "0"   
## [18,] "t" "0"    "0"        "0"    "0"   
## [19,] "u" "0"    "0"        "0"    "0"   
## [20,] "v" "0"    "0"        "0"    "0"   
## [21,] "w" "0"    "0"        "0"    "0"   
## [22,] "x" "0"    "0"        "0"    "0"   
## [23,] "y" "0"    "0"        "0"    "0"   
## [24,] "z" "0"    "0"        "0"    "0"

練習1:完成這項工作

##    ID   CKD Depression    DM   HTN
## 1   a  TRUE       TRUE  TRUE  TRUE
## 2   b  TRUE       TRUE  TRUE FALSE
## 3   c FALSE       TRUE  TRUE FALSE
## 4   d  TRUE      FALSE FALSE  TRUE
## 5   e  TRUE       TRUE FALSE FALSE
## 6   f  TRUE      FALSE  TRUE  TRUE
## 7   g  TRUE       TRUE  TRUE FALSE
## 8   h  TRUE       TRUE FALSE  TRUE
## 9   i FALSE       TRUE  TRUE  TRUE
## 10  j FALSE      FALSE FALSE  TRUE
## 11  k  TRUE      FALSE FALSE FALSE
## 12  l  TRUE      FALSE FALSE FALSE
## 13  m  TRUE       TRUE FALSE  TRUE
## 14  n  TRUE       TRUE FALSE  TRUE
## 15  p  TRUE       TRUE FALSE  TRUE
## 16  r FALSE      FALSE  TRUE  TRUE
## 17  s  TRUE      FALSE  TRUE FALSE
## 18  t  TRUE       TRUE FALSE FALSE
## 19  u  TRUE       TRUE  TRUE FALSE
## 20  v FALSE      FALSE  TRUE  TRUE
## 21  w FALSE      FALSE  TRUE  TRUE
## 22  x  TRUE      FALSE FALSE  TRUE
## 23  y  TRUE      FALSE  TRUE  TRUE
## 24  z  TRUE       TRUE FALSE FALSE

練習2:另一種型態的資料

– 這份資料相較於上一份有些微的不同,但處理起來邏輯是類似的

– 請各位試著將這份資料轉換可分析的樣式

##    問卷編號 Disease名稱1 Disease名稱2 Disease名稱3
## 1         1                                       
## 2         2       高血壓          C肝             
## 3         3          C肝       腎結石             
## 4         4       高血壓         氣喘         痛風
## 5         5       高血壓                          
## 6         6   腎功能不全                          
## 7         7       高血壓       心臟病             
## 8         8       高血壓       心臟病          B肝
## 9         9       心臟病                          
## 10       10       高血壓       糖尿病

練習1答案

dat = read.csv("data3_1.csv", header = TRUE, fileEncoding = 'CP950')

levels.disease = levels(dat[,2])
levels.sample = levels(dat[,1])
new.dat = matrix(0, nrow = length(levels.sample), ncol = length(levels.disease)+1)
colnames(new.dat) = c("ID", levels.disease)
new.dat[,1] = levels.sample 
for (i in 1:length(levels.sample)) {
  new.dat[i,-1] = levels.disease%in%dat[dat[,1]==levels.sample[i],2]
}
new.dat = data.frame(new.dat)

new.dat

練習2答案(1)

– 如果直接合併會出問題,必須先轉換成文字格式才能合併

dat = read.csv("data3_2.csv", header = TRUE, fileEncoding = 'CP950')

dat[,2] = as.character(dat[,2])
dat[,3] = as.character(dat[,3])
dat[,4] = as.character(dat[,4])

all_disease = c(dat[,2], dat[,3], dat[,4])
lvl.disease = levels(factor(all_disease))

– 或者是,在讀檔時直接把文字讀成非因子格式

dat = read.csv("data3_2.csv", header = TRUE, fileEncoding = 'CP950', stringsAsFactors = FALSE)

all_disease = c(dat[,2], dat[,3], dat[,4])
lvl.disease = levels(factor(all_disease))

練習2答案(2)

lvl.disease = lvl.disease[-1]
lvl.sample = levels(factor(dat[,1]))

new.dat = matrix(0, nrow = length(lvl.sample), ncol = length(lvl.disease)+1)
colnames(new.dat) = c("ID", lvl.disease)
new.dat[,1] = lvl.sample 

for (i in 1:length(lvl.disease)) {
  new.dat[i,-1] = lvl.disease %in% dat[dat[,1]==lvl.sample[i],2:4]
}
new.dat = data.frame(new.dat)

new.dat

第二節 高級資料轉換(1)

– 當然,之後也許你會看到一些函數能加速整個流程,但在面對小筆資料時熟練的使用迴圈能幫忙我們迅速做完資料轉換

– 這份資料是從三軍總醫院生化檢驗值系統截取某10位病患在這段期間內所測得之各式生化值

dat = read.csv("data3_3.csv", header = TRUE, fileEncoding = 'CP950')
head(dat, 10)
##    PATNUMBER          COLLECTIONDATE      TESTNAME RESVALUE
## 1       2185 2011/12/12 上午 8:09:00    Creatinine      7.0
## 2       2185 2011/12/12 上午 8:09:00 Total Calcium      7.1
## 3       2185 2011/12/12 上午 8:09:00            Na    137.0
## 4       2185 2011/12/12 上午 8:09:00            IP      7.9
## 5        691 2011/12/12 下午 6:32:00    Creatinine      3.1
## 6       2185 2011/12/29 上午 6:19:00    Creatinine      7.2
## 7       2185 2011/12/29 上午 6:19:00            Na    136.0
## 8        691 2011/12/19 上午 4:38:00    Creatinine      8.0
## 9        691 2011/12/19 上午 4:38:00            Na    137.0
## 10      2185 2011/12/19 上午 8:47:00    Creatinine      8.1
##      PATNUMBER COLLECTIONDATE            Albumin Albumin body fluid AST
## [1,] "175"     "2011/10/1 上午 8:24:00"  NA      NA                 NA 
## [2,] "175"     "2011/10/5 下午 4:46:00"  NA      NA                 NA 
## [3,] "175"     "2011/10/6 上午 9:01:00"  NA      NA                 NA 
## [4,] "175"     "2011/10/8 上午 6:42:00"  NA      NA                 NA 
## [5,] "175"     "2011/11/10 上午 9:01:00" NA      NA                 NA 
## [6,] "175"     "2011/11/10 下午 1:25:00" NA      NA                 NA 
##      BUN BUN Fluid Cholesterol Fluid Creatinine Creatinine Fluid GLU(AC)
## [1,] NA  NA        NA                "3.7"      NA               NA     
## [2,] NA  NA        NA                "3.2"      NA               NA     
## [3,] NA  NA        NA                NA         NA               NA     
## [4,] NA  NA        NA                "3.4"      NA               NA     
## [5,] NA  NA        NA                NA         NA               NA     
## [6,] NA  NA        NA                NA         NA               NA     
##      HDL-Cholesterol IP    K  LDL-Cholesterol Na    Total Calcium
## [1,] NA              "4.3" NA NA              "138" "7.3"        
## [2,] NA              NA    NA NA              "139" NA           
## [3,] NA              "4.5" NA NA              NA    "7.8"        
## [4,] NA              NA    NA NA              NA    NA           
## [5,] NA              "4.5" NA NA              NA    "7.3"        
## [6,] NA              NA    NA NA              NA    NA           
##      Total Cholesterol Triglyceride Triglycerol Fluid Uric Acid
## [1,] NA                NA           NA                NA       
## [2,] NA                NA           NA                NA       
## [3,] "342"             "335"        NA                NA       
## [4,] NA                NA           NA                NA       
## [5,] "342"             "326"        NA                NA       
## [6,] NA                NA           NA                NA       
##      urine Calcium urine Phosphorus urine Potassium urine Sodium
## [1,] NA            NA               NA              NA          
## [2,] NA            NA               NA              NA          
## [3,] NA            NA               NA              NA          
## [4,] NA            NA               NA              NA          
## [5,] NA            NA               NA              NA          
## [6,] "0.6"         "28.3"           "39.1"          "48"        
##      urine Uric Acid
## [1,] NA             
## [2,] NA             
## [3,] NA             
## [4,] NA             
## [5,] NA             
## [6,] NA

第二節 高級資料轉換(2)

– 一樣,先取得這份資料的基本資訊

levels.TESTNAME = levels(dat[,3])
levels.TESTNAME
##  [1] "Albumin"            "Albumin body fluid" "AST"               
##  [4] "BUN"                "BUN Fluid"          "Cholesterol Fluid" 
##  [7] "Creatinine"         "Creatinine Fluid"   "GLU(AC)"           
## [10] "HDL-Cholesterol"    "IP"                 "K"                 
## [13] "LDL-Cholesterol"    "Na"                 "Total Calcium"     
## [16] "Total Cholesterol"  "Triglyceride"       "Triglycerol Fluid" 
## [19] "Uric Acid"          "urine Calcium"      "urine Phosphorus"  
## [22] "urine Potassium"    "urine Sodium"       "urine Uric Acid"
n.TESTNAME = length(levels.TESTNAME)
n.TESTNAME
## [1] 24
levels.PATNUMBER = levels(as.factor(dat[,1]))
levels.PATNUMBER
##  [1] "175"  "356"  "691"  "1332" "1350" "1654" "1826" "2074" "2154" "2185"
n.PATNUMBER = length(levels.PATNUMBER)
n.PATNUMBER
## [1] 10

第二節 高級資料轉換(3)

– 在寫迴圈時,我習慣先在起頭令迴圈變數為1,如果這段以後能執行,那應該整個迴圈都不會有問題

i = 1
subdat = dat[dat[,1]==levels.PATNUMBER[i],]
levels.COLLECTIONDATE = levels(subdat[,2])
n.COLLECTIONDATE = length(levels.COLLECTIONDATE)
n.COLLECTIONDATE
## [1] 1532
subdat[,2] = as.factor(as.character(subdat[,2]))
levels.COLLECTIONDATE = levels(subdat[,2])
n.COLLECTIONDATE = length(levels.COLLECTIONDATE)
n.COLLECTIONDATE
## [1] 132

第二節 高級資料轉換(4)

– 第一欄填ID,第二欄填上這個人所有測量的時間點

submatrix = matrix(NA, nrow = n.COLLECTIONDATE, ncol = n.TESTNAME+2)
colnames(submatrix) = c("PATNUMBER", "COLLECTIONDATE", levels.TESTNAME)

submatrix[,1] = levels.PATNUMBER[i]
submatrix[,2] = levels.COLLECTIONDATE

head(submatrix)
##      PATNUMBER COLLECTIONDATE            Albumin Albumin body fluid AST
## [1,] "175"     "2011/10/1 上午 8:24:00"  NA      NA                 NA 
## [2,] "175"     "2011/10/5 下午 4:46:00"  NA      NA                 NA 
## [3,] "175"     "2011/10/6 上午 9:01:00"  NA      NA                 NA 
## [4,] "175"     "2011/10/8 上午 6:42:00"  NA      NA                 NA 
## [5,] "175"     "2011/11/10 上午 9:01:00" NA      NA                 NA 
## [6,] "175"     "2011/11/10 下午 1:25:00" NA      NA                 NA 
##      BUN BUN Fluid Cholesterol Fluid Creatinine Creatinine Fluid GLU(AC)
## [1,] NA  NA        NA                NA         NA               NA     
## [2,] NA  NA        NA                NA         NA               NA     
## [3,] NA  NA        NA                NA         NA               NA     
## [4,] NA  NA        NA                NA         NA               NA     
## [5,] NA  NA        NA                NA         NA               NA     
## [6,] NA  NA        NA                NA         NA               NA     
##      HDL-Cholesterol IP K  LDL-Cholesterol Na Total Calcium
## [1,] NA              NA NA NA              NA NA           
## [2,] NA              NA NA NA              NA NA           
## [3,] NA              NA NA NA              NA NA           
## [4,] NA              NA NA NA              NA NA           
## [5,] NA              NA NA NA              NA NA           
## [6,] NA              NA NA NA              NA NA           
##      Total Cholesterol Triglyceride Triglycerol Fluid Uric Acid
## [1,] NA                NA           NA                NA       
## [2,] NA                NA           NA                NA       
## [3,] NA                NA           NA                NA       
## [4,] NA                NA           NA                NA       
## [5,] NA                NA           NA                NA       
## [6,] NA                NA           NA                NA       
##      urine Calcium urine Phosphorus urine Potassium urine Sodium
## [1,] NA            NA               NA              NA          
## [2,] NA            NA               NA              NA          
## [3,] NA            NA               NA              NA          
## [4,] NA            NA               NA              NA          
## [5,] NA            NA               NA              NA          
## [6,] NA            NA               NA              NA          
##      urine Uric Acid
## [1,] NA             
## [2,] NA             
## [3,] NA             
## [4,] NA             
## [5,] NA             
## [6,] NA

– 同樣的,我們先看第一個時間點,注意迴圈變數不要重複,所以剛剛設i,這次我們設j

j = 1
subsubdat = subdat[subdat[,2]==levels.COLLECTIONDATE[j],]
subsubdat
##      PATNUMBER         COLLECTIONDATE      TESTNAME RESVALUE
## 3993       175 2011/10/1 上午 8:24:00            IP      4.3
## 3994       175 2011/10/1 上午 8:24:00 Total Calcium      7.3
## 3995       175 2011/10/1 上午 8:24:00    Creatinine      3.7
## 3996       175 2011/10/1 上午 8:24:00            Na    138.0

第二節 高級資料轉換(5)

  1. 找出這列是描述哪個生化值
  2. 找出在submatrix中這個生化值的位置
  3. 把value填入那個位置

– 函數「which()」可以幫我們找位置

k = 1
NAME = subsubdat[k,3]
NAME
## [1] IP
## 24 Levels: Albumin Albumin body fluid AST BUN ... urine Uric Acid
position = which(NAME == levels.TESTNAME) + 2 
position
## [1] 13
submatrix[j, position] = subsubdat[k,4]
head(submatrix)
##      PATNUMBER COLLECTIONDATE            Albumin Albumin body fluid AST
## [1,] "175"     "2011/10/1 上午 8:24:00"  NA      NA                 NA 
## [2,] "175"     "2011/10/5 下午 4:46:00"  NA      NA                 NA 
## [3,] "175"     "2011/10/6 上午 9:01:00"  NA      NA                 NA 
## [4,] "175"     "2011/10/8 上午 6:42:00"  NA      NA                 NA 
## [5,] "175"     "2011/11/10 上午 9:01:00" NA      NA                 NA 
## [6,] "175"     "2011/11/10 下午 1:25:00" NA      NA                 NA 
##      BUN BUN Fluid Cholesterol Fluid Creatinine Creatinine Fluid GLU(AC)
## [1,] NA  NA        NA                NA         NA               NA     
## [2,] NA  NA        NA                NA         NA               NA     
## [3,] NA  NA        NA                NA         NA               NA     
## [4,] NA  NA        NA                NA         NA               NA     
## [5,] NA  NA        NA                NA         NA               NA     
## [6,] NA  NA        NA                NA         NA               NA     
##      HDL-Cholesterol IP    K  LDL-Cholesterol Na Total Calcium
## [1,] NA              "4.3" NA NA              NA NA           
## [2,] NA              NA    NA NA              NA NA           
## [3,] NA              NA    NA NA              NA NA           
## [4,] NA              NA    NA NA              NA NA           
## [5,] NA              NA    NA NA              NA NA           
## [6,] NA              NA    NA NA              NA NA           
##      Total Cholesterol Triglyceride Triglycerol Fluid Uric Acid
## [1,] NA                NA           NA                NA       
## [2,] NA                NA           NA                NA       
## [3,] NA                NA           NA                NA       
## [4,] NA                NA           NA                NA       
## [5,] NA                NA           NA                NA       
## [6,] NA                NA           NA                NA       
##      urine Calcium urine Phosphorus urine Potassium urine Sodium
## [1,] NA            NA               NA              NA          
## [2,] NA            NA               NA              NA          
## [3,] NA            NA               NA              NA          
## [4,] NA            NA               NA              NA          
## [5,] NA            NA               NA              NA          
## [6,] NA            NA               NA              NA          
##      urine Uric Acid
## [1,] NA             
## [2,] NA             
## [3,] NA             
## [4,] NA             
## [5,] NA             
## [6,] NA

第二節 高級資料轉換(6)

subsubdat #先看看subsubdat裡面有哪些
##      PATNUMBER         COLLECTIONDATE      TESTNAME RESVALUE
## 3993       175 2011/10/1 上午 8:24:00            IP      4.3
## 3994       175 2011/10/1 上午 8:24:00 Total Calcium      7.3
## 3995       175 2011/10/1 上午 8:24:00    Creatinine      3.7
## 3996       175 2011/10/1 上午 8:24:00            Na    138.0
for (k in 1:nrow(subsubdat)) {
  NAME = subsubdat[k,3]
  position = which(NAME == levels.TESTNAME) + 2
  submatrix[j, position] = subsubdat[k,4]
}
head(submatrix)
##      PATNUMBER COLLECTIONDATE            Albumin Albumin body fluid AST
## [1,] "175"     "2011/10/1 上午 8:24:00"  NA      NA                 NA 
## [2,] "175"     "2011/10/5 下午 4:46:00"  NA      NA                 NA 
## [3,] "175"     "2011/10/6 上午 9:01:00"  NA      NA                 NA 
## [4,] "175"     "2011/10/8 上午 6:42:00"  NA      NA                 NA 
## [5,] "175"     "2011/11/10 上午 9:01:00" NA      NA                 NA 
## [6,] "175"     "2011/11/10 下午 1:25:00" NA      NA                 NA 
##      BUN BUN Fluid Cholesterol Fluid Creatinine Creatinine Fluid GLU(AC)
## [1,] NA  NA        NA                "3.7"      NA               NA     
## [2,] NA  NA        NA                NA         NA               NA     
## [3,] NA  NA        NA                NA         NA               NA     
## [4,] NA  NA        NA                NA         NA               NA     
## [5,] NA  NA        NA                NA         NA               NA     
## [6,] NA  NA        NA                NA         NA               NA     
##      HDL-Cholesterol IP    K  LDL-Cholesterol Na    Total Calcium
## [1,] NA              "4.3" NA NA              "138" "7.3"        
## [2,] NA              NA    NA NA              NA    NA           
## [3,] NA              NA    NA NA              NA    NA           
## [4,] NA              NA    NA NA              NA    NA           
## [5,] NA              NA    NA NA              NA    NA           
## [6,] NA              NA    NA NA              NA    NA           
##      Total Cholesterol Triglyceride Triglycerol Fluid Uric Acid
## [1,] NA                NA           NA                NA       
## [2,] NA                NA           NA                NA       
## [3,] NA                NA           NA                NA       
## [4,] NA                NA           NA                NA       
## [5,] NA                NA           NA                NA       
## [6,] NA                NA           NA                NA       
##      urine Calcium urine Phosphorus urine Potassium urine Sodium
## [1,] NA            NA               NA              NA          
## [2,] NA            NA               NA              NA          
## [3,] NA            NA               NA              NA          
## [4,] NA            NA               NA              NA          
## [5,] NA            NA               NA              NA          
## [6,] NA            NA               NA              NA          
##      urine Uric Acid
## [1,] NA             
## [2,] NA             
## [3,] NA             
## [4,] NA             
## [5,] NA             
## [6,] NA

第二節 高級資料轉換(7)

for (j in 1:n.COLLECTIONDATE) {
  subsubdat = subdat[subdat[,2]==levels.COLLECTIONDATE[j],]
  for (k in 1:nrow(subsubdat)) {
    NAME = subsubdat[k,3]
    position = which(NAME == levels.TESTNAME) + 2
    submatrix[j, position] = subsubdat[k,4]
  }
}

head(submatrix)
##      PATNUMBER COLLECTIONDATE            Albumin Albumin body fluid AST
## [1,] "175"     "2011/10/1 上午 8:24:00"  NA      NA                 NA 
## [2,] "175"     "2011/10/5 下午 4:46:00"  NA      NA                 NA 
## [3,] "175"     "2011/10/6 上午 9:01:00"  NA      NA                 NA 
## [4,] "175"     "2011/10/8 上午 6:42:00"  NA      NA                 NA 
## [5,] "175"     "2011/11/10 上午 9:01:00" NA      NA                 NA 
## [6,] "175"     "2011/11/10 下午 1:25:00" NA      NA                 NA 
##      BUN BUN Fluid Cholesterol Fluid Creatinine Creatinine Fluid GLU(AC)
## [1,] NA  NA        NA                "3.7"      NA               NA     
## [2,] NA  NA        NA                "3.2"      NA               NA     
## [3,] NA  NA        NA                NA         NA               NA     
## [4,] NA  NA        NA                "3.4"      NA               NA     
## [5,] NA  NA        NA                NA         NA               NA     
## [6,] NA  NA        NA                NA         NA               NA     
##      HDL-Cholesterol IP    K  LDL-Cholesterol Na    Total Calcium
## [1,] NA              "4.3" NA NA              "138" "7.3"        
## [2,] NA              NA    NA NA              "139" NA           
## [3,] NA              "4.5" NA NA              NA    "7.8"        
## [4,] NA              NA    NA NA              NA    NA           
## [5,] NA              "4.5" NA NA              NA    "7.3"        
## [6,] NA              NA    NA NA              NA    NA           
##      Total Cholesterol Triglyceride Triglycerol Fluid Uric Acid
## [1,] NA                NA           NA                NA       
## [2,] NA                NA           NA                NA       
## [3,] "342"             "335"        NA                NA       
## [4,] NA                NA           NA                NA       
## [5,] "342"             "326"        NA                NA       
## [6,] NA                NA           NA                NA       
##      urine Calcium urine Phosphorus urine Potassium urine Sodium
## [1,] NA            NA               NA              NA          
## [2,] NA            NA               NA              NA          
## [3,] NA            NA               NA              NA          
## [4,] NA            NA               NA              NA          
## [5,] NA            NA               NA              NA          
## [6,] "0.6"         "28.3"           "39.1"          "48"        
##      urine Uric Acid
## [1,] NA             
## [2,] NA             
## [3,] NA             
## [4,] NA             
## [5,] NA             
## [6,] NA

第二節 高級資料轉換(8)

– 下列這串程式碼可以獲得一個完整的submatrix

i = 1

subdat = dat[dat[,1]==levels.PATNUMBER[i],]
subdat[,2] = as.factor(as.character(subdat[,2]))
levels.COLLECTIONDATE = levels(subdat[,2])
n.COLLECTIONDATE = length(levels.COLLECTIONDATE)
n.COLLECTIONDATE

submatrix = matrix(NA, nrow = n.COLLECTIONDATE, ncol = n.TESTNAME+2)
colnames(submatrix) = c("PATNUMBER", "COLLECTIONDATE", levels.TESTNAME)

submatrix[,1] = levels.PATNUMBER[i]
submatrix[,2] = levels.COLLECTIONDATE

for (j in 1:n.COLLECTIONDATE) {
  subsubdat = subdat[subdat[,2]==levels.COLLECTIONDATE[j],]
  for (k in 1:nrow(subsubdat)) {
    NAME = subsubdat[k,3]
    position = which(NAME == levels.TESTNAME) + 2
    submatrix[j, position] = subsubdat[k,4]
  }
}
final.data = NULL
final.data
## NULL
final.data = rbind(final.data, submatrix)
head(final.data)
##      PATNUMBER COLLECTIONDATE            Albumin Albumin body fluid AST
## [1,] "175"     "2011/10/1 上午 8:24:00"  NA      NA                 NA 
## [2,] "175"     "2011/10/5 下午 4:46:00"  NA      NA                 NA 
## [3,] "175"     "2011/10/6 上午 9:01:00"  NA      NA                 NA 
## [4,] "175"     "2011/10/8 上午 6:42:00"  NA      NA                 NA 
## [5,] "175"     "2011/11/10 上午 9:01:00" NA      NA                 NA 
## [6,] "175"     "2011/11/10 下午 1:25:00" NA      NA                 NA 
##      BUN BUN Fluid Cholesterol Fluid Creatinine Creatinine Fluid GLU(AC)
## [1,] NA  NA        NA                "3.7"      NA               NA     
## [2,] NA  NA        NA                "3.2"      NA               NA     
## [3,] NA  NA        NA                NA         NA               NA     
## [4,] NA  NA        NA                "3.4"      NA               NA     
## [5,] NA  NA        NA                NA         NA               NA     
## [6,] NA  NA        NA                NA         NA               NA     
##      HDL-Cholesterol IP    K  LDL-Cholesterol Na    Total Calcium
## [1,] NA              "4.3" NA NA              "138" "7.3"        
## [2,] NA              NA    NA NA              "139" NA           
## [3,] NA              "4.5" NA NA              NA    "7.8"        
## [4,] NA              NA    NA NA              NA    NA           
## [5,] NA              "4.5" NA NA              NA    "7.3"        
## [6,] NA              NA    NA NA              NA    NA           
##      Total Cholesterol Triglyceride Triglycerol Fluid Uric Acid
## [1,] NA                NA           NA                NA       
## [2,] NA                NA           NA                NA       
## [3,] "342"             "335"        NA                NA       
## [4,] NA                NA           NA                NA       
## [5,] "342"             "326"        NA                NA       
## [6,] NA                NA           NA                NA       
##      urine Calcium urine Phosphorus urine Potassium urine Sodium
## [1,] NA            NA               NA              NA          
## [2,] NA            NA               NA              NA          
## [3,] NA            NA               NA              NA          
## [4,] NA            NA               NA              NA          
## [5,] NA            NA               NA              NA          
## [6,] "0.6"         "28.3"           "39.1"          "48"        
##      urine Uric Acid
## [1,] NA             
## [2,] NA             
## [3,] NA             
## [4,] NA             
## [5,] NA             
## [6,] NA

第二節 高級資料轉換(9)

levels.TESTNAME = levels(dat[,3])
n.TESTNAME = length(levels.TESTNAME)
levels.PATNUMBER = levels(as.factor(dat[,1]))
n.PATNUMBER = length(levels.PATNUMBER)

final.data = NULL

for (i in 1:n.PATNUMBER) {
  subdat = dat[dat[,1]==levels.PATNUMBER[i],]
  subdat[,2] = as.factor(as.character(subdat[,2]))
  levels.COLLECTIONDATE = levels(subdat[,2])
  n.COLLECTIONDATE = length(levels.COLLECTIONDATE)

  submatrix = matrix(NA, nrow = n.COLLECTIONDATE, ncol = n.TESTNAME+2)
  colnames(submatrix) = c("PATNUMBER", "COLLECTIONDATE", levels.TESTNAME)

  submatrix[,1] = levels.PATNUMBER[i]
  submatrix[,2] = levels.COLLECTIONDATE

  for (j in 1:n.COLLECTIONDATE) {
    subsubdat = subdat[subdat[,2]==levels.COLLECTIONDATE[j],]
    for (k in 1:nrow(subsubdat)) {
      NAME = subsubdat[k,3]
      position = which(NAME == levels.TESTNAME) + 2
      submatrix[j, position] = subsubdat[k,4]
    }
  }
  
  final.data = rbind(final.data, submatrix)
}

head(final.data)

練習3:家庭作業

– 這次,除了檔案更大以外,檔案的最後還有參考值。如果你的值位於參考值內,那就是正常,否則則是過高。

– 我們這次不要填數值,而是填入正常(TRUE)或異常(FALSE)!

dat = read.csv("data3_4.csv", header = TRUE, fileEncoding = 'CP950')
head(dat, 10)
##    PATNUMBER SEX          COLLECTIONDATE          TESTNAME RESVALUE  UNITS
## 1        180   1 2011/12/11 上午 5:10:00                Na    131.0 mmol/L
## 2        589   1 2011/12/11 上午 6:37:00        Creatinine      3.8  mg/dL
## 3        589   1 2011/12/11 上午 6:37:00                Na    138.0 mmol/L
## 4       1015   1 2011/12/12 上午 7:38:00 Total Cholesterol    158.0  mg/dL
## 5       1015   1 2011/12/12 上午 7:38:00        Creatinine      1.5  mg/dL
## 6       1015   1 2011/12/12 上午 7:38:00      Triglyceride    140.0  mg/dL
## 7       1015   1 2011/12/12 上午 7:38:00                Na    143.0 mmol/L
## 8        480   2 2011/12/12 上午 7:41:00      Triglyceride    153.0  mg/dL
## 9        480   2 2011/12/12 上午 7:41:00                Na    139.0 mmol/L
## 10       480   2 2011/12/12 上午 7:41:00 Total Cholesterol    211.0  mg/dL
##    MINIMUM MAXIMUM
## 1    136.0   145.0
## 2      0.7     1.2
## 3    136.0   145.0
## 4       NA   200.0
## 5      0.7     1.2
## 6       NA   200.0
## 7    136.0   145.0
## 8       NA   200.0
## 9    136.0   145.0
## 10      NA   200.0

– 函數「Sys.sleep()」是讓系統休息,你不需要將他加入你的迴圈內

n = 100
pb = txtProgressBar(max = n, style=3)
for(i in 1:n) {
  Sys.sleep(0.1)
  setTxtProgressBar(pb, i)
}
close(pb)

練習3答案

levels.TESTNAME = levels(dat[,'TESTNAME'])
n.TESTNAME = length(levels.TESTNAME)
levels.PATNUMBER = levels(as.factor(dat[,'PATNUMBER']))
n.PATNUMBER = length(levels.PATNUMBER)

final.data = NULL

pb = txtProgressBar(max = n.PATNUMBER, style=3)

for (i in 1:n.PATNUMBER) {
  subdat = dat[dat[,'PATNUMBER']==levels.PATNUMBER[i],]
  subdat[,'COLLECTIONDATE'] = as.factor(as.character(subdat[,'COLLECTIONDATE']))
  levels.COLLECTIONDATE = levels(subdat[,'COLLECTIONDATE'])
  n.COLLECTIONDATE = length(levels.COLLECTIONDATE)

  submatrix = matrix(NA, nrow = n.COLLECTIONDATE, ncol = n.TESTNAME+2)
  colnames(submatrix) = c("PATNUMBER", "COLLECTIONDATE", levels.TESTNAME)

  submatrix[,1] = levels.PATNUMBER[i]
  submatrix[,2] = levels.COLLECTIONDATE

  for (j in 1:n.COLLECTIONDATE) {
    subsubdat = subdat[subdat[,'COLLECTIONDATE']==levels.COLLECTIONDATE[j],]
    for (k in 1:nrow(subsubdat)) {
      NAME = subsubdat[k,'TESTNAME']
      position = which(NAME == levels.TESTNAME) + 2
      VALUE = subsubdat[k,'RESVALUE']
      MINIMUM = subsubdat[k,'MINIMUM']
      MAXIMUM = subsubdat[k,'MAXIMUM']
      if (is.na(MINIMUM)) {MINIMUM = -Inf}
      if (is.na(MAXIMUM)) {MAXIMUM = Inf}
      submatrix[j, position] = (VALUE >= MINIMUM & VALUE <= MAXIMUM)
    }
  }
  
  final.data = rbind(final.data, submatrix)
  
  setTxtProgressBar(pb, i)
  
}

close(pb)

head(final.data)

小結

  1. 更熟練的組合現有的函數
  2. 更熟練的利用迴圈做大量重複的事
  3. 對於資料處理時的事前規劃